# Load packages ----
library(tidyverse)
library(glmnet)
library(caTools)
library(caret)
library(ROCR)
# Load coded data ----
# Manually coded artifact ratings (ground truth) per subject/run/volume
coded = read.csv("~/Documents/code/dsnlab/automotion-test-set/tds_artifact_coded_volumes.csv")
# Load stripe detection data ----
# Load and combine the stripe-detection output files.
fileDir <- "~/Documents/code/dsnlab/automotion-test-set/output/"
filePattern <- "tds_stripes_.*.csv"
file_list <- list.files(fileDir, pattern = filePattern)
# Read each file, standardize column names, and row-bind once at the end.
# FIX: replaces the exists()/rbind accumulation loop, which grew the data
# frame one file at a time (O(n^2) copying), duplicated the read/rename
# logic in both branches, and silently appended duplicate rows if the
# script was re-run in the same session.
stripes <- file_list %>%
  lapply(function(f) {
    read.csv(paste0(fileDir, f)) %>%
      rename("volume" = t,
             "subjectID" = subject) %>%
      select(-file)
  }) %>%
  bind_rows()
# Load global intensities and realignment parameters (rp) ----
# define paths and variables
# Define paths and variables
rpDir <- '~/Documents/code/dsnlab/automotion-test-set/rp_txt/'
outputDir <- '~/Documents/code/tds_auto-motion/auto-motion-output/'
study <- "tds2"
# rp filenames look like rp_123_run.txt; capture groups = subjectID, run
rpPattern <- "^rp_([0-9]{3})_(.*).txt"
rpCols <- c("euclidian_trans", "euclidian_rot",
            "euclidian_trans_deriv", "euclidian_rot_deriv", "trash.rp")
# Global intensities per subject/run/volume
intensities <- read.csv(paste0(outputDir, study, '_globalIntensities.csv'))
# Subject 157, run stop3: shift volume numbers above 43 down by one so they
# line up with the other data sources
intensities <- mutate(intensities,
                      volume = ifelse(subjectID == 157 & run == "stop3" & volume > 43,
                                      volume - 1, volume))
# rp files
# Load and combine the realignment-parameter files.
file_list <- list.files(rpDir, pattern = rpPattern)
# Read each rp text file, attach its filename, number the volumes, and pull
# subjectID/run out of the filename.
# FIX 1: replaces the exists()/rbind accumulation loop (O(n^2) growth,
# duplicated logic, duplicate rows on re-run in the same session).
# FIX 2: the original used rep(file, count(temp)) -- dplyr::count() returns
# a data frame, not the integer rep() expects; nrow() is the row count.
rp <- file_list %>%
  lapply(function(f) {
    dat <- read.table(paste0(rpDir, f))
    colnames(dat) <- rpCols
    data.frame(dat, file = rep(f, nrow(dat))) %>%
      mutate(volume = row_number()) %>%
      extract(file, c("subjectID", "run"), rpPattern) %>%
      mutate(subjectID = as.integer(subjectID))
  }) %>%
  bind_rows()
# Join dataframes ----
# Join stripes, manual codes, global intensities, and realignment
# parameters into one volume-level data frame.
joined <- left_join(stripes, coded, by = c("subjectID", "run", "volume")) %>%
  left_join(., intensities, by = c("subjectID", "run", "volume")) %>%
  left_join(., rp, by = c("subjectID", "run", "volume")) %>%
  # volumes absent from the coded file get striping/intensity 0; a striping
  # code above 1 counts as an artifact
  mutate(striping = ifelse(is.na(striping), 0, striping),
         intensity = ifelse(is.na(intensity), 0, intensity),
         tile = paste0("tile_", tile),
         artifact = ifelse(striping > 1, 1, 0)) %>%
  # successive-volume differences within each subject/run/tile
  group_by(subjectID, run, tile) %>%
  mutate(Diff.mean = volMean - lag(volMean),
         Diff.sd = volSD - lag(volSD)) %>%
  # FIX: drop the grouping before reshaping so `joined` is not silently
  # carried downstream as a grouped data frame
  ungroup() %>%
  spread(tile, freqtile_power)
# Split the data ----
# 75/25 split, stratified on the artifact label.
set.seed(101)
# FIX: renamed `sample` -> `in_train`; the original name masked base::sample().
in_train <- sample.split(joined$artifact, SplitRatio = .75)
training <- subset(joined, in_train == TRUE)
testing <- subset(joined, in_train == FALSE)
# Machine learning ----
# Tidy data
# Tidy one split (training or testing) for the ML models: zero-fill the
# first-volume NA diffs, mean-center tile power within subject/run/tile,
# drop label/raw columns, and order columns as IDs, criterion, predictors.
# Extracted into a helper to remove the duplicated train/test pipeline.
tidy_ml <- function(data) {
  data %>%
    group_by(subjectID, run) %>%
    mutate(Diff.mean = ifelse(is.na(Diff.mean), 0, Diff.mean),
           Diff.sd = ifelse(is.na(Diff.sd), 0, Diff.sd)) %>%
    gather(tile, freqtile_power, starts_with("tile")) %>%
    # "_c" marks the mean-centered version of each tile column
    mutate(tile = paste0(tile, "_c")) %>%
    group_by(subjectID, run, tile) %>%
    mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm = TRUE)) %>%
    ungroup() %>%
    select(-freqtile_power) %>%
    spread(tile, freqtile_power_c) %>%
    select(-striping, -intensity, -trash.rp, -fsl.volume, -volMean, -volSD,
           -euclidian_rot, -euclidian_trans) %>%
    select(subjectID, run, volume, artifact, everything())
}
train.ml <- tidy_ml(training)
test.ml <- tidy_ml(testing)
# Lasso logistic regression: fit beta weights for each predictor ----
# Training
# subset predictors and criterion
# Subset predictors (everything after subjectID, run, volume, artifact) and
# the binary artifact criterion as a plain matrix/vector for glmnet.
x_train = as.matrix(train.ml[,-c(1,2,3,4)])
y_train = as.double(as.matrix(train.ml[, 4]))
# run xval to determine lambda: L1-penalized (alpha = 1) logistic
# regression, selecting lambda by cross-validated AUC
# NOTE(review): parallel=TRUE only takes effect if a foreach backend (e.g.
# doParallel) is registered; none is registered in this script, so fitting
# presumably runs sequentially -- confirm.
cv.train <- cv.glmnet(x_train, y_train, family='binomial', alpha=1, parallel=TRUE, standardize=TRUE, type.measure='auc')
plot(cv.train)

# coefficient paths across the lambda sequence
plot(cv.train$glmnet.fit, xvar="lambda", label=TRUE)

# lambda minimizing CV error, and the largest lambda within 1 SE of it
cv.train$lambda.min
## [1] 0.0001055024
cv.train$lambda.1se
## [1] 0.01331289
# coefficients at lambda.min (denser model)
coef(cv.train, s=cv.train$lambda.min)
## 16 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -5.34966479
## euclidian_trans_deriv -1.96821531
## euclidian_rot_deriv 1.25812915
## Diff.mean 0.01511855
## Diff.sd 0.03814063
## tile_1_c -68.96785531
## tile_10_c 7750.81268518
## tile_11_c -5214.91044479
## tile_2_c -349.90077443
## tile_3_c .
## tile_4_c -1015.84159332
## tile_5_c -1210.50826519
## tile_6_c 176.50824285
## tile_7_c 1794.76700119
## tile_8_c 3217.08847243
## tile_9_c -811.96646874
# coefficients at lambda.1se (sparser model used for prediction below)
coef(cv.train, s=cv.train$lambda.1se)
## 16 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -4.24641815
## euclidian_trans_deriv .
## euclidian_rot_deriv .
## Diff.mean .
## Diff.sd 0.01718502
## tile_1_c -17.31491278
## tile_10_c 3019.45382532
## tile_11_c .
## tile_2_c .
## tile_3_c .
## tile_4_c .
## tile_5_c .
## tile_6_c .
## tile_7_c .
## tile_8_c .
## tile_9_c .
# Evaluate the lasso model on the training sample.
pred_train <- predict(cv.train, newx = x_train, s = cv.train$lambda.1se, type = "response")
# ROCR prediction object pairing predicted probabilities with labels
predicted <- prediction(pred_train, y_train, label.ordering = NULL)
# plot cutoff v. accuracy
perf <- performance(predicted, measure = "acc")
perf.df <- data.frame(cut = perf@x.values[[1]], acc = perf@y.values[[1]])
ggplot(perf.df, aes(cut, acc)) +
  geom_line()

# plot false v. true positive rate (ROC curve)
perf <- performance(predicted, measure = "tpr", x.measure = "fpr")
perf.df <- data.frame(cut = perf@alpha.values[[1]], fpr = perf@x.values[[1]], tpr = perf@y.values[[1]])
ggplot(perf.df, aes(fpr, tpr)) +
  geom_line()

# plot specificity v. sensitivity
perf <- performance(predicted, measure = "sens", x.measure = "spec")
# FIX: in ROCR, `measure` ("sens") fills y.values and `x.measure` ("spec")
# fills x.values; the original stored x.values in a column named `sens` and
# y.values in `spec`, swapping the labels in both plots below.
perf.df <- data.frame(cut = perf@alpha.values[[1]], spec = perf@x.values[[1]], sens = perf@y.values[[1]])
ggplot(perf.df, aes(spec, sens)) +
  geom_line()

ggplot(perf.df, aes(x = cut)) +
  geom_line(aes(y = sens, color = "sensitivity")) +
  geom_line(aes(y = spec, color = "specificity"))

# cutoff maximizing sensitivity + specificity
perf@alpha.values[[1]][which.max(perf@x.values[[1]]+perf@y.values[[1]])] # cut
## [1] 0.02640321
max(perf@x.values[[1]]+perf@y.values[[1]]) # sensitivity + specificity
## [1] 1.738462
# Confusion matrix at a 0.03 probability cutoff (near the ~0.026 cut that
# maximized sensitivity + specificity above).
pred_train <- predict(cv.train, newx = x_train, s = cv.train$lambda.1se, type = "response")
# FIX: the original used a strict > / < pair, so a prediction exactly equal
# to .03 was left as a raw probability; >= classifies every value.
pred_train[pred_train >= .03] <- 1
pred_train[pred_train < .03] <- 0
# NOTE(review): newer caret versions require factor inputs here -- confirm
# the caret version in use.
confusionMatrix(pred_train, y_train)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5530 34
## 1 127 85
##
## Accuracy : 0.9721
## 95% CI : (0.9675, 0.9762)
## No Information Rate : 0.9794
## P-Value [Acc > NIR] : 0.9999
##
## Kappa : 0.5004
## Mcnemar's Test P-Value : 0.0000000000004149
##
## Sensitivity : 0.9775
## Specificity : 0.7143
## Pos Pred Value : 0.9939
## Neg Pred Value : 0.4009
## Prevalence : 0.9794
## Detection Rate : 0.9574
## Detection Prevalence : 0.9633
## Balanced Accuracy : 0.8459
##
## 'Positive' Class : 0
##
# Testing
# Subset predictors and criterion for the held-out test sample.
x_test <- as.matrix(test.ml[,-c(1,2,3,4)])
y_test <- as.double(as.matrix(test.ml[, 4]))
# Apply the trained lasso model to the holdout sample.
pred_test <- predict(cv.train, newx = x_test, s = cv.train$lambda.1se, type = "response")
# FIX: >= so predictions exactly at the .03 cutoff are classified (the
# original's strict > / < pair left them as raw probabilities).
pred_test[pred_test >= .03] <- 1
pred_test[pred_test < .03] <- 0
# confusion matrix
confusionMatrix(pred_test, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1846 7
## 1 40 33
##
## Accuracy : 0.9756
## 95% CI : (0.9677, 0.982)
## No Information Rate : 0.9792
## P-Value [Acc > NIR] : 0.8828
##
## Kappa : 0.5726
## Mcnemar's Test P-Value : 0.000003046
##
## Sensitivity : 0.9788
## Specificity : 0.8250
## Pos Pred Value : 0.9962
## Neg Pred Value : 0.4521
## Prevalence : 0.9792
## Detection Rate : 0.9585
## Detection Prevalence : 0.9621
## Balanced Accuracy : 0.9019
##
## 'Positive' Class : 0
##
# SVM ----
# Training
# Drop ID columns and recode the criterion as a yes/no factor (caret needs
# a factor outcome with valid R names as levels when classProbs = TRUE).
train.svm <- train.ml[,-c(1,2,3)] %>%
  mutate(artifact = ifelse(artifact == 1, "yes","no"),
         artifact = as.factor(artifact))
# Specify control parameters: 10-fold CV repeated 3 times.
# FIX: metric = "ROC" below requires summaryFunction = twoClassSummary in
# trainControl; without it caret warns that ROC is not in the result set
# and silently falls back to selecting on Accuracy.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                           classProbs = TRUE, summaryFunction = twoClassSummary)
# Fit a linear SVM on centered/scaled predictors.
set.seed(101)
svmFit <- train(artifact ~ .,
                data = train.svm,
                method = "svmLinear",
                trControl = fitControl,
                preProcess = c("center", "scale"),
                tuneLength = 10,
                metric = "ROC",
                verbose = FALSE)
svmFit$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 157
##
## Objective Function Value : -150.248
## Training error : 0.010215
## Probability model included.
# predict model
# Predicted artifact probabilities for the training sample ("yes" column only).
train_pred <- predict(svmFit, newdata = train.svm, type = "prob") %>%
  select(-no)
# plot cutoff v. accuracy
predicted <- prediction(train_pred, train.svm$artifact, label.ordering = NULL)
perf <- performance(predicted, measure = "acc")
perf.df <- data.frame(cut = perf@x.values[[1]], acc = perf@y.values[[1]])
ggplot(perf.df, aes(cut, acc)) +
  geom_line()

# plot false v. true positive rate (ROC curve)
perf <- performance(predicted, measure = "tpr", x.measure = "fpr")
perf.df <- data.frame(cut = perf@alpha.values[[1]], fpr = perf@x.values[[1]], tpr = perf@y.values[[1]])
ggplot(perf.df, aes(fpr, tpr)) +
  geom_line()

# plot specificity v. sensitivity
perf <- performance(predicted, measure = "sens", x.measure = "spec")
# FIX: in ROCR, `measure` ("sens") fills y.values and `x.measure` ("spec")
# fills x.values; the original swapped the column labels.
perf.df <- data.frame(cut = perf@alpha.values[[1]], spec = perf@x.values[[1]], sens = perf@y.values[[1]])
ggplot(perf.df, aes(spec, sens)) +
  geom_line()

ggplot(perf.df, aes(x = cut)) +
  geom_line(aes(y = sens, color = "sensitivity")) +
  geom_line(aes(y = spec, color = "specificity"))

# cutoff maximizing sensitivity + specificity
perf@alpha.values[[1]][which.max(perf@x.values[[1]]+perf@y.values[[1]])] # cut
## [1] 0.02344554
max(perf@x.values[[1]]+perf@y.values[[1]]) # sensitivity + specificity
## [1] 1.743969
# Cut predicted probabilities at .07 and assess accuracy in the training sample.
train_pred <- predict(svmFit, newdata = train.svm, type = "prob") %>%
  select(-no)
# FIX: the original assigned "yes" first, coercing the matrix to character,
# so the subsequent `< .07` test was a lexicographic string comparison
# (e.g. "1e-04" < "0.07" is FALSE) and small probabilities could escape
# reclassification; it also left values exactly at .07 untouched.
# Threshold numerically in a single step instead.
train_pred <- ifelse(as.matrix(train_pred) >= .07, "yes", "no")
confusionMatrix(train_pred, train.svm$artifact)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 5612 36
## yes 45 83
##
## Accuracy : 0.986
## 95% CI : (0.9826, 0.9888)
## No Information Rate : 0.9794
## P-Value [Acc > NIR] : 0.0001236
##
## Kappa : 0.6649
## Mcnemar's Test P-Value : 0.3740628
##
## Sensitivity : 0.9920
## Specificity : 0.6975
## Pos Pred Value : 0.9936
## Neg Pred Value : 0.6484
## Prevalence : 0.9794
## Detection Rate : 0.9716
## Detection Prevalence : 0.9778
## Balanced Accuracy : 0.8448
##
## 'Positive' Class : no
##
# Testing
# Prepare the test split with the same yes/no factor criterion.
test.svm <- test.ml[,-c(1,2,3)] %>%
  mutate(artifact = ifelse(artifact == 1, "yes","no"),
         artifact = as.factor(artifact))
# Cut at .07 and assess accuracy in the held-out sample.
test_pred <- predict(svmFit, newdata = test.svm, type = "prob") %>%
  select(-no)
# FIX: threshold numerically in one step; the original's "yes" assignment
# coerced the matrix to character, turning the `< .07` test into a string
# comparison, and left values exactly at .07 untouched.
test_pred <- ifelse(as.matrix(test_pred) >= .07, "yes", "no")
confusionMatrix(test_pred, test.svm$artifact)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1867 11
## yes 19 29
##
## Accuracy : 0.9844
## 95% CI : (0.9778, 0.9895)
## No Information Rate : 0.9792
## P-Value [Acc > NIR] : 0.05977
##
## Kappa : 0.6512
## Mcnemar's Test P-Value : 0.20124
##
## Sensitivity : 0.9899
## Specificity : 0.7250
## Pos Pred Value : 0.9941
## Neg Pred Value : 0.6042
## Prevalence : 0.9792
## Detection Rate : 0.9694
## Detection Prevalence : 0.9751
## Balanced Accuracy : 0.8575
##
## 'Positive' Class : no
##
# Save models ----
# Save the fitted models.
# FIX: write to explicit paths instead of calling setwd(), which mutated
# global working-directory state nothing else in the script relies on
# (all other paths are absolute).
modelDir <- "~/Documents/code/dsnlab/automotion-test-set"
saveRDS(cv.train, file.path(modelDir, "model_log_TDS.rds"))
saveRDS(svmFit, file.path(modelDir, "model_svm_TDS.rds"))
# Weighted model (exploratory; kept commented out) ----
# # create model weights (they sum to one)
# model_weights = ifelse(train.svm$artifact == "yes",
# (1/table(train.svm$artifact)[1]) * 0.5,
# (1/table(train.svm$artifact)[2]) * 0.5)
#
# # use the same seed to ensure same cross-validation splits
# fitControl$seeds = svmFit$control$seeds
#
# svmFit_weighted = train(artifact ~ .,
# data = train.svm,
# method = "svmLinear",
# trControl = fitControl,
# preProcess = c("center", "scale"),
# tuneLength = 10,
# metric = "ROC",
# verbose = FALSE,
# weights = model_weights)
#
# svmFit_weighted$finalModel
#
# test_pred_weighted = predict(svmFit_weighted, newdata = test.svm, type="prob") %>%
# select(-no)
# test_pred_weighted = as.matrix(test_pred_weighted)
# test_pred_weighted[test_pred_weighted > .07] = "yes"
# test_pred_weighted[test_pred_weighted < .07] = "no"
# confusionMatrix(test_pred_weighted, test.svm$artifact)
# # determine best cost parameter
# grid <- expand.grid(C = c(0, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 5))
# set.seed(3233)
# svm_Linear_Grid <- train(artifact ~ .,
# data = train.svm, method = "svmLinear",
# trControl=fitControl,
# preProcess = c("center", "scale"),
# tuneGrid = grid,
# tuneLength = 10)
# svm_Linear_Grid
# plot(svm_Linear_Grid)
# test_pred_grid = predict(svm_Linear_Grid, newdata = test.svm)
# confusionMatrix(test_pred_grid, test.svm$artifact)
# Auto-motion (rule-based) process ----
# Training
# Rule-based ("auto-motion") artifact coding, applied identically to the
# training and test splits; extracted into a helper to remove the
# duplicated 40-line pipeline.
code_trash <- function(data) {
  data %>%
    gather(tile, freqtile_power, starts_with("tile")) %>%
    filter(tile %in% c("tile_1", "tile_10")) %>%
    # code trash based on mean, sd, and rp
    ungroup() %>%
    mutate(meanDiff.mean = mean(Diff.mean, na.rm=TRUE),
           sdDiff.mean = sd(Diff.mean, na.rm=TRUE),
           meanDiff.sd = mean(Diff.sd, na.rm=TRUE),
           sdDiff.sd = sd(Diff.sd, na.rm=TRUE),
           # code volumes outside mean +/- 2 SD thresholds as trash
           upper.mean = meanDiff.mean + 2*sdDiff.mean,
           lower.mean = meanDiff.mean - 2*sdDiff.mean,
           trash.mean = ifelse(Diff.mean > upper.mean | Diff.mean < lower.mean, 1, 0),
           upper.sd = meanDiff.sd + 2*sdDiff.sd,
           lower.sd = meanDiff.sd - 2*sdDiff.sd,
           trash.sd = ifelse(Diff.sd > upper.sd | Diff.sd < lower.sd, 1, 0),
           # code volumes with more than +/- .25mm translation or rotation
           # in Euclidian distance
           trash.rp.tr = ifelse(euclidian_trans_deriv > .25 | euclidian_trans_deriv < -.25, 1, 0),
           trash.rp.rot = ifelse(euclidian_rot_deriv > .25 | euclidian_rot_deriv < -.25, 1, 0)) %>%
    select(-meanDiff.mean, -meanDiff.sd, -sdDiff.mean, -sdDiff.sd) %>%
    # code trash based on striping: mean-center tile power within subject/run/tile
    group_by(subjectID, run, tile) %>%
    mutate(freqtile_power_c = freqtile_power - mean(freqtile_power, na.rm=TRUE)) %>%
    ungroup() %>%
    select(-freqtile_power) %>%
    spread(tile, freqtile_power_c) %>%
    mutate(trash.stripe = ifelse(tile_1 < -.035 & tile_10 > .00025, 1, 0)) %>%
    # combine indicators: striping alone flags a volume; otherwise at least
    # two of the rp/mean/sd indicators must agree
    mutate(trash.combined = ifelse(trash.stripe == 1, 1, 0),
           trash.combined = ifelse((trash.rp.tr + trash.rp.rot + trash.mean + trash.sd) > 1, 1, trash.combined)) %>%
    # recode as trash if the volumes before and after are both trash
    # NOTE(review): lag()/lead() run over the ungrouped data here, so this
    # check can cross subject/run boundaries -- confirm that is intended.
    mutate(trash.combined = ifelse(trash.combined == 0 & lag(trash.combined) == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%
    # code first volume as trash if second volume is trash
    mutate(trash.combined = ifelse(volume == 1 & lead(trash.combined) == 1, 1, trash.combined)) %>%
    # hit = flagged & coded artifact; neg = miss (coded, not flagged);
    # pos = false positive; NA = correct rejection
    mutate(hits = ifelse(trash.combined == 1 & (artifact == 1), "hit",
                  ifelse(trash.combined == 0 & (artifact == 1), "neg",
                  ifelse(trash.combined == 1 & (artifact == 0), "pos", NA))),
           # FIX: the original used ifelse(regexpr('.*', hits), ...), but
           # regexpr('.*', x) always matches at position 1 (and is NA for NA
           # input), so the '' branch was unreachable. Label only coded volumes.
           label = ifelse(is.na(hits), '', as.character(volume)),
           hits = as.factor(hits)) %>%
    gather(tile, freqtile_power_c, c("tile_1", "tile_10"))
}
# Training
train.man <- code_trash(training)
# Testing
test.man <- code_trash(testing)
# Compare hit rates ----
# Training data
# select only one set of observations and code correct rejections
# Keep one row per volume (tile_1) and recode the hit categories, treating
# NA (no hit/miss/false-positive code) as a correct rejection.
train.tab <- train.man %>%
  filter(tile == "tile_1") %>%
  # hits is only ever "hit", "neg", "pos", or NA, so the original nested
  # ifelse chain reduces to recoding NA. (The original's final fallback
  # passed the factor `hits` through ifelse(), which would have produced
  # integer codes had it ever been reached.)
  mutate(hits.tot = ifelse(is.na(hits), "cor.rej", as.character(hits)))
# Lasso logistic regression
# Training-sample confusion for the lasso model: compute the confusion
# matrix once and reuse it instead of recomputing it for each statistic.
cm.log.train <- confusionMatrix(pred_train, y_train)
cm.log.train$table
## Reference
## Prediction 0 1
## 0 5530 34
## 1 127 85
cm.log.train$overall[1]
## Accuracy
## 0.972126
cm.log.train$byClass[11]
## Balanced Accuracy
## 0.8459178
# SVM
# Training-sample confusion for the SVM: compute once and reuse.
cm.svm.train <- confusionMatrix(train_pred, train.svm$artifact)
cm.svm.train$table
## Reference
## Prediction no yes
## no 5612 36
## yes 45 83
cm.svm.train$overall[1]
## Accuracy
## 0.9859765
cm.svm.train$byClass[11]
## Balanced Accuracy
## 0.8447621
# Manual
# Accuracy of the rule-based coding in the training sample.
train.counts <- table(train.tab$hits.tot)
train.counts
##
## cor.rej hit neg pos
## 5609 90 28 49
# FIX: index the table by category name rather than position; positional
# indices silently shift if any category is ever absent.
cor.rej <- train.counts[["cor.rej"]]
hit <- train.counts[["hit"]]
neg <- train.counts[["neg"]]
pos <- train.counts[["pos"]]
total <- cor.rej + hit + neg + pos
sprintf('Accuracy: %s', round((cor.rej + hit) / total, 2))
## [1] "Accuracy: 0.99"
# balanced accuracy: mean of specificity and sensitivity
sprintf('Balanced accuracy: %s', round(((cor.rej / (cor.rej + pos)) + (hit / (hit + neg))) / 2, 2))
## [1] "Balanced accuracy: 0.88"
# Test data
# Keep one row per volume (tile_1) and recode hit categories, treating NA
# as a correct rejection.
test.tab <- test.man %>%
  filter(tile == "tile_1") %>%
  # hits is only ever "hit", "neg", "pos", or NA, so the nested ifelse
  # chain reduces to recoding NA (the original's final NA fallback was
  # unreachable).
  mutate(hits.tot = ifelse(is.na(hits), "cor.rej", as.character(hits)))
# Lasso logistic regression
# Test-sample confusion for the lasso model: compute once and reuse.
cm.log.test <- confusionMatrix(pred_test, y_test)
cm.log.test$table
## Reference
## Prediction 0 1
## 0 1846 7
## 1 40 33
cm.log.test$overall[1]
## Accuracy
## 0.9755971
cm.log.test$byClass[11]
## Balanced Accuracy
## 0.9018955
# SVM
# Test-sample confusion for the SVM: compute once and reuse.
cm.svm.test <- confusionMatrix(test_pred, test.svm$artifact)
cm.svm.test$table
## Reference
## Prediction no yes
## no 1867 11
## yes 19 29
cm.svm.test$overall[1]
## Accuracy
## 0.9844237
cm.svm.test$byClass[11]
## Balanced Accuracy
## 0.8574629
# Manual
# Accuracy of the rule-based coding in the test sample.
test.counts <- table(test.tab$hits.tot)
test.counts
##
## cor.rej hit neg pos
## 1874 35 4 13
# FIX: index the table by category name rather than position; positional
# indices silently shift if any category is ever absent.
cor.rej <- test.counts[["cor.rej"]]
hit <- test.counts[["hit"]]
neg <- test.counts[["neg"]]
pos <- test.counts[["pos"]]
total <- cor.rej + hit + neg + pos
sprintf('Accuracy: %s', round((cor.rej + hit) / total, 2))
## [1] "Accuracy: 0.99"
# balanced accuracy: mean of specificity and sensitivity
sprintf('Balanced accuracy: %s', round(((cor.rej / (cor.rej + pos)) + (hit / (hit + neg))) / 2, 2))
## [1] "Balanced accuracy: 0.95"
# Plot composite ----
# Combine training and test data
# Combined rule-based codings for the full (training + test) sample
full_man = bind_rows(train.man, test.man)
# Plot machine learning
# Logistic regression
# Pool training and test rows and apply the fitted lasso model to the full sample.
full_log <- bind_rows(train.ml, test.ml)
x <- as.matrix(full_log[,-c(1,2,3,4)])
y <- as.double(as.matrix(full_log[, 4]))
pred <- predict(cv.train, newx = x, s = cv.train$lambda.1se, type = "response")
# FIX: >= so predictions exactly at the .03 cutoff are classified (the
# original's strict > / < pair left them as raw probabilities).
pred[pred >= .03] <- 1
pred[pred < .03] <- 0
confusionMatrix(pred, y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7376 41
## 1 167 118
##
## Accuracy : 0.973
## 95% CI : (0.9691, 0.9765)
## No Information Rate : 0.9794
## P-Value [Acc > NIR] : 0.9999
##
## Kappa : 0.5188
## Mcnemar's Test P-Value : <0.0000000000000002
##
## Sensitivity : 0.9779
## Specificity : 0.7421
## Pos Pred Value : 0.9945
## Neg Pred Value : 0.4140
## Prevalence : 0.9794
## Detection Rate : 0.9577
## Detection Prevalence : 0.9630
## Balanced Accuracy : 0.8600
##
## 'Positive' Class : 0
##
# SVM
# Pool training and test rows and apply the fitted SVM to the full sample.
full_svm <- bind_rows(train.ml, test.ml) %>%
  mutate(artifact = ifelse(artifact == 1, "yes","no"),
         artifact = as.factor(artifact))
full_pred <- predict(svmFit, newdata = full_svm, type = "prob") %>%
  select(-no)
# FIX: threshold numerically in one step; the original's "yes" assignment
# coerced the matrix to character, turning the `< .07` test into a string
# comparison. ifelse() keeps the dimnames of its test argument, so the
# "yes" column name survives for the rename() further down.
full_pred <- ifelse(as.matrix(full_pred) >= .07, "yes", "no")
confusionMatrix(full_pred, full_svm$artifact)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7479 47
## yes 64 112
##
## Accuracy : 0.9856
## 95% CI : (0.9827, 0.9881)
## No Information Rate : 0.9794
## P-Value [Acc > NIR] : 0.00003083
##
## Kappa : 0.6613
## Mcnemar's Test P-Value : 0.1288
##
## Sensitivity : 0.9915
## Specificity : 0.7044
## Pos Pred Value : 0.9938
## Neg Pred Value : 0.6364
## Prevalence : 0.9794
## Detection Rate : 0.9710
## Detection Prevalence : 0.9771
## Balanced Accuracy : 0.8480
##
## 'Positive' Class : no
##
# Manual
# Rule-based coding on the full sample: confusion of the combined trash
# flag against the manually coded artifact label (one row per volume).
full_man1 = full_man %>%
filter(tile == "tile_1")
# NOTE(review): both arguments are numeric 0/1 vectors; recent caret
# versions require factors here -- confirm the caret version in use.
confusionMatrix(full_man1$trash.combined, full_man1$artifact)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7386 32
## 1 62 125
##
## Accuracy : 0.9876
## 95% CI : (0.9849, 0.99)
## No Information Rate : 0.9794
## P-Value [Acc > NIR] : 0.00000002905
##
## Kappa : 0.7205
## Mcnemar's Test P-Value : 0.00278
##
## Sensitivity : 0.9917
## Specificity : 0.7962
## Pos Pred Value : 0.9957
## Neg Pred Value : 0.6684
## Prevalence : 0.9794
## Detection Rate : 0.9712
## Detection Prevalence : 0.9754
## Balanced Accuracy : 0.8939
##
## 'Positive' Class : 0
##
# Plot and compare models ----
# Join predictions back to the data and code hit types for plotting.
# "hit" = predicted & coded artifact; "pos" = false positive; "neg" = miss;
# NA = correct rejection. (`1` is the glmnet prediction column, named for
# its single lambda value.)
data.plot.log <- bind_cols(full_log, as.data.frame(y), as.data.frame(pred)) %>%
  mutate(hits = ifelse(y == 1 & `1` == 1, "hit",
                ifelse(y == 0 & `1` == 1, "pos",
                ifelse(y == 1 & `1` == 0, "neg", NA))),
         # FIX: regexpr('.*', hits) always matches (and is NA for NA input),
         # so the original '' branch was unreachable. Label only coded volumes.
         label = ifelse(is.na(hits), '', as.character(volume)),
         hits = as.factor(hits))
data.plot.svm <- bind_cols(full_svm, as.data.frame(full_pred)) %>%
  rename("full_pred" = yes) %>%
  mutate(hits = ifelse(artifact == "yes" & full_pred == "yes", "hit",
                ifelse(artifact == "no" & full_pred == "yes", "pos",
                ifelse(artifact == "yes" & full_pred == "no", "neg", NA))),
         # FIX: same regexpr misuse as above
         label = ifelse(is.na(hits), '', as.character(volume)),
         hits = as.factor(hits))
# Long-format comparison of the three approaches (auto rules, lasso, SVM)
# per subject/run/volume for faceted plotting.
plot.comp <- full_man %>%
  filter(tile == "tile_1") %>%
  select(subjectID, run, volume, euclidian_trans_deriv, hits) %>%
  rename("auto" = hits) %>%
  # NOTE(review): joining on euclidian_trans_deriv, a floating-point value,
  # is fragile; subjectID/run/volume should already identify a volume --
  # confirm the extra key is needed.
  left_join(data.plot.log, by = c("subjectID", "run", "volume", "euclidian_trans_deriv")) %>%
  select(subjectID, run, volume, euclidian_trans_deriv, auto, hits) %>%
  rename("log" = hits) %>%
  left_join(data.plot.svm, by = c("subjectID", "run", "volume", "euclidian_trans_deriv")) %>%
  select(subjectID, run, volume, euclidian_trans_deriv, auto, log, hits) %>%
  rename("svm" = hits) %>%
  gather(model, hits, c("auto", "log", "svm")) %>%
  # FIX: regexpr('.*', hits) always matches (and is NA for NA input), so
  # the original '' branch was unreachable. Label only coded volumes.
  mutate(label = ifelse(is.na(hits), '', as.character(volume)))
# Plot head motion (translation derivative) over volumes for each
# subject/run/model, coloring flagged volumes by hit type and labeling
# them with the volume number.
ggplot(plot.comp, aes(x = volume, y = euclidian_trans_deriv)) +
geom_line(size = .25) +
geom_point(data = subset(plot.comp, !is.na(hits)), aes(color = hits), size = 2.5) +
geom_text(aes(label = label), size = 1.5) +
facet_wrap(~ subjectID + run + model, ncol = 9, scales = "free") +
scale_color_manual(values = c("#3B9AB2","#EBCC2A","#F21A00")) +
theme(axis.text.x = element_text(size = 6))
